import pandas as pd
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.set_option('display.max_columns', 500)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import json
import numpy as np
import warnings
warnings.filterwarnings("ignore")
df1 = pd.read_csv("valeursfoncieres-2019.txt", sep="|")
df2 = pd.read_csv("valeursfoncieres-2020.txt", sep="|")
df3 = pd.read_csv("valeursfoncieres-2021.txt", sep="|")
df = pd.concat([df1, df2, df3])
save = df.copy()
COLUMNS = list(df.columns)
ROWS = df.shape[0]
COLS = df.shape[1]
df.shape
(10483168, 43)
On a un jeu de données très volumineux, en effet on a plus de 10 million de lignes
df.head()
| Code service CH | Reference document | 1 Articles CGI | 2 Articles CGI | 3 Articles CGI | 4 Articles CGI | 5 Articles CGI | No disposition | Date mutation | Nature mutation | Valeur fonciere | No voie | B/T/Q | Type de voie | Code voie | Voie | Code postal | Commune | Code departement | Code commune | Prefixe de section | Section | No plan | No Volume | 1er lot | Surface Carrez du 1er lot | 2eme lot | Surface Carrez du 2eme lot | 3eme lot | Surface Carrez du 3eme lot | 4eme lot | Surface Carrez du 4eme lot | 5eme lot | Surface Carrez du 5eme lot | Nombre de lots | Code type local | Type local | Identifiant local | Surface reelle bati | Nombre pieces principales | Nature culture | Nature culture speciale | Surface terrain | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 | 04/01/2019 | Vente | 37220,00 | 26.000 | NaN | RUE | 2730 | DE MONTHOLON | 1000.000 | BOURG-EN-BRESSE | 1 | 53 | NaN | AI | 298 | NaN | 8.000 | 19,27 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 | 2.000 | Appartement | NaN | 20.000 | 1.000 | NaN | NaN | NaN |
| 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 | 04/01/2019 | Vente | 185100,00 | 22.000 | B | RUE | 1650 | GEN DELESTRAINT | 1000.000 | BOURG-EN-BRESSE | 1 | 53 | NaN | AM | 95 | NaN | 7.000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 | 3.000 | Dépendance | NaN | 0.000 | 0.000 | NaN | NaN | NaN |
| 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 | 04/01/2019 | Vente | 185100,00 | 22.000 | NaN | RUE | 1650 | GEN DELESTRAINT | 1000.000 | BOURG-EN-BRESSE | 1 | 53 | NaN | AM | 95 | NaN | 137.000 | NaN | 154.000 | 61,51 | NaN | NaN | NaN | NaN | NaN | NaN | 2 | 2.000 | Appartement | NaN | 62.000 | 3.000 | NaN | NaN | NaN |
| 3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 | 08/01/2019 | Vente | 209000,00 | 3.000 | NaN | RUE | 0043 | DES CHAMPAGNES | 1160.000 | PRIAY | 1 | 314 | NaN | E | 1676 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | 1.000 | Maison | NaN | 90.000 | 4.000 | S | NaN | 940.000 |
| 4 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 | 07/01/2019 | Vente | 134900,00 | 5.000 | NaN | LOT | A003 | LE BIOLAY | 1370.000 | SAINT-ETIENNE-DU-BOIS | 1 | 350 | NaN | AA | 11 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | 1.000 | Maison | NaN | 101.000 | 5.000 | S | NaN | 490.000 |
On a beaucoup de NaN dans le jeu de données. Il faut regarder comment ils sont interprétés par pandas.
df.describe()
| Code service CH | Reference document | 1 Articles CGI | 2 Articles CGI | 3 Articles CGI | 4 Articles CGI | 5 Articles CGI | No disposition | No voie | Code postal | Code commune | Prefixe de section | No plan | 5eme lot | Nombre de lots | Code type local | Identifiant local | Surface reelle bati | Nombre pieces principales | Surface terrain | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 10483168.000 | 6421609.000 | 10385542.000 | 10483168.000 | 466379.000 | 10483168.000 | 20572.000 | 10483168.000 | 5913229.000 | 0.000 | 5906474.000 | 5906474.000 | 7171580.000 |
| mean | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.220 | 742.348 | 50766.050 | 207.933 | 425.779 | 414.316 | 124.463 | 0.412 | 2.085 | NaN | 79.276 | 2.212 | 2835.086 |
| std | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 4.882 | 2063.104 | 27357.530 | 167.519 | 312.993 | 581.474 | 1816.576 | 0.853 | 0.945 | NaN | 684.735 | 2.119 | 13656.028 |
| min | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.000 | 1.000 | 1000.000 | 1.000 | 1.000 | 1.000 | 2.000 | 0.000 | 1.000 | NaN | 0.000 | 0.000 | 0.000 |
| 25% | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.000 | 8.000 | 29510.000 | 73.000 | 162.000 | 87.000 | 8.000 | 0.000 | 1.000 | NaN | 0.000 | 0.000 | 235.000 |
| 50% | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.000 | 25.000 | 49430.000 | 172.000 | 305.000 | 226.000 | 23.000 | 0.000 | 2.000 | NaN | 53.000 | 2.000 | 606.000 |
| 75% | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.000 | 99.000 | 75016.000 | 298.000 | 819.000 | 508.000 | 63.000 | 1.000 | 3.000 | NaN | 90.000 | 4.000 | 1750.000 |
| max | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 694.000 | 9999.000 | 97490.000 | 909.000 | 950.000 | 9844.000 | 191613.000 | 233.000 | 4.000 | NaN | 312962.000 | 109.000 | 10723091.000 |
Même remarque qu'au dessus.
dico = {"nom_colonne": [],
"valeur": []}
for col in COLUMNS:
dico["nom_colonne"].append(col)
dico["valeur"].append(df[df[col].isna()].shape[0])
dico = pd.DataFrame(dico)
dico["pourcentage"] = dico["valeur"] / ROWS * 100
dico.sort_values(by="valeur", inplace=True)
fig = px.bar(dico, x="nom_colonne", y='pourcentage', color="pourcentage", color_continuous_scale=[(0, "green"), (0.5, "orange"), (1, "red")])
fig.update_xaxes(tickangle=45)
fig.update_layout(title="Bar chart sur la distribution des pourcentages de données manquantes par colonne",
xaxis_title="Noms des colonnes",
yaxis_title="Pourcentages de données manquantes",
title_x=0.5)
fig.show()
list(dico[dico["pourcentage"] ==100].nom_colonne)
['1 Articles CGI', '2 Articles CGI', '3 Articles CGI', 'Identifiant local', '4 Articles CGI', '5 Articles CGI', 'Reference document', 'Code service CH']
Ces colonnes n'ont pas du tout de données. On va les supprimer pour simplifier l'analyse.
dtypes = df.dtypes.reset_index()
dtypes.rename(columns = {"index": "nom_col", 0: "type"}, inplace=True)
distribution = dtypes.type.value_counts().reset_index()
distribution.rename(columns = {"index": "type", "type": "occurence"}, inplace=True)
distribution.sort_values(by="type", inplace=True)
distribution["type"] = distribution["type"].astype(str)
traduction_type = {"int64": "entier", "float64": "décimal", "object": "objet"}
distribution["type"] = distribution["type"].map(lambda x: traduction_type[x])
distribution.sort_values(by="type", inplace=True)
fig = px.pie(distribution, values="occurence", names="type", color="type", color_discrete_map={"entier": "green",
"décimal": "orange",
"objet": "red"})
fig.update_layout(title="Pie chart sur la distribution des types de données", showlegend=False, title_x=0.5)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()
df.select_dtypes(include='object').head()
| Date mutation | Nature mutation | Valeur fonciere | B/T/Q | Type de voie | Code voie | Voie | Commune | Code departement | Section | No Volume | 1er lot | Surface Carrez du 1er lot | 2eme lot | Surface Carrez du 2eme lot | 3eme lot | Surface Carrez du 3eme lot | 4eme lot | Surface Carrez du 4eme lot | Surface Carrez du 5eme lot | Type local | Nature culture | Nature culture speciale | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 04/01/2019 | Vente | 37220,00 | NaN | RUE | 2730 | DE MONTHOLON | BOURG-EN-BRESSE | 1 | AI | NaN | 8.000 | 19,27 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Appartement | NaN | NaN |
| 1 | 04/01/2019 | Vente | 185100,00 | B | RUE | 1650 | GEN DELESTRAINT | BOURG-EN-BRESSE | 1 | AM | NaN | 7.000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Dépendance | NaN | NaN |
| 2 | 04/01/2019 | Vente | 185100,00 | NaN | RUE | 1650 | GEN DELESTRAINT | BOURG-EN-BRESSE | 1 | AM | NaN | 137.000 | NaN | 154.000 | 61,51 | NaN | NaN | NaN | NaN | NaN | Appartement | NaN | NaN |
| 3 | 08/01/2019 | Vente | 209000,00 | NaN | RUE | 0043 | DES CHAMPAGNES | PRIAY | 1 | E | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Maison | S | NaN |
| 4 | 07/01/2019 | Vente | 134900,00 | NaN | LOT | A003 | LE BIOLAY | SAINT-ETIENNE-DU-BOIS | 1 | AA | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Maison | S | NaN |
def nettoyage(df):
df["Date mutation"] = pd.to_datetime(df["Date mutation"])
df.drop(['1 Articles CGI','2 Articles CGI',
'3 Articles CGI','Identifiant local',
'4 Articles CGI','5 Articles CGI',
'Reference document','Code service CH'], axis=1, inplace=True)
df["Valeur fonciere"] = df["Valeur fonciere"].map(lambda x: str(x).replace(",", ".")).astype(float)
df = df[(~df["No Volume"].str.contains("[a-zA-Z]",na=False))]
df["No Volume"] = df["No Volume"].astype(float)
df = df[(~df["1er lot"].str.contains("[a-zA-Z]", na=False))]
df["1er lot"] = df["1er lot"].astype(float)
df["Surface Carrez du 1er lot"] = df["Surface Carrez du 1er lot"].map(lambda x: str(x).replace(",", ".")).astype(float)
df = df[(~df["2eme lot"].str.contains("[a-zA-Z]", na=False))]
df["2eme lot"] = df["2eme lot"].astype(float)
df["Surface Carrez du 2eme lot"] = df["Surface Carrez du 2eme lot"].map(lambda x: str(x).replace(",", ".")).astype(float)
df = df[(~df["3eme lot"].str.contains("[a-zA-Z]", na=False))]
df["3eme lot"] = df["3eme lot"].astype(float)
df["Surface Carrez du 3eme lot"] = df["Surface Carrez du 3eme lot"].map(lambda x: str(x).replace(",", ".")).astype(float)
df = df[(~df["4eme lot"].str.contains("[a-zA-Z]", na=False))]
df["4eme lot"] = df["4eme lot"].astype(float)
df["Surface Carrez du 4eme lot"] = df["Surface Carrez du 4eme lot"].map(lambda x: str(x).replace(",", ".")).astype(float)
df["Surface Carrez du 5eme lot"] = df["Surface Carrez du 5eme lot"].map(lambda x: str(x).replace(",", ".")).astype(float)
cols = df.select_dtypes(include='int64').columns
for col in cols:
df[col] = df[col].astype(float)
return df
df_net = nettoyage(df)
dtypes_net = df_net.dtypes.reset_index()
dtypes_net.rename(columns = {"index": "nom_col", 0: "type"}, inplace=True)
distribution_net = dtypes_net.type.value_counts().reset_index()
distribution_net.rename(columns = {"index": "type", "type": "occurence"}, inplace=True)
traduction_type = {"int64": "entier", "float64": "décimal", "object": "objet", "datetime64[ns]": "date"}
distribution_net["type"] = distribution_net["type"].astype(str)
distribution_net["type"] = distribution_net["type"].map(lambda x: traduction_type[x])
distribution_net.sort_values(by="type", inplace=True)
fig = px.pie(distribution_net, values="occurence", names="type", color="type", color_discrete_map={"entier": "green",
"décimal": "orange",
"objet": "red",
"date": "blue"})
fig.update_layout(title="Pie chart sur la distribution des types de données", showlegend=False, title_x=0.5)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()
from plotly.subplots import make_subplots
import plotly.graph_objects as go
fig = make_subplots(rows=1, cols=2, specs=[[{"type": "pie"}, {"type": "pie"}]])
colors=['orange', 'green', 'red']
values = list(distribution.occurence)
labels = list(distribution.type)
fig.add_trace(go.Pie(values=values, labels=labels, marker=dict(colors=colors), title="Avant"), row=1, col=1)
values = list(distribution_net.occurence)
labels = list(distribution_net.type)
colors=["blue","orange", "red"]
fig.add_trace(go.Pie(values=values, labels=labels, marker=dict(colors=colors), title="Après"), row=1, col=2)
fig.update_layout(title="Vues avant/après", showlegend=False, title_x=0.5)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()
On remarque que ce jeu de données a plus de colonnes numériques que catégorielles.
NOM_MOIS = {
1: "Janvier",
2: "Février",
3: "Mars",
4: "Avril",
5: "Mai",
6: "Juin",
7: "Juillet",
8: "Août",
9: "Septembre",
10: "Octobre",
11: "Novembre",
12: "Décembre"
}
NUM_MOIS = {v: k for k,v in NOM_MOIS.items()}
def nature_culture(x):
NATURE_CULTURE = {
"S": "sols",
"J": "jardins",
"AG": "terrains d'agrément",
"AB": "terrains à bâtir",
"BT": "taillis simples",
"T": "terres",
"P": "prés",
"VE": "vergers",
"L": "landes",
"E": "eaux",
"BS": "taillis sous futaie",
"B": "bois",
"BR": "futaies résineuses",
"BP": "peupleraies",
"LB": "landes boisées",
"BF": "futaies feuillues",
"VI": "vignes",
"CH": "chemin de fer",
"CA": "carrières",
"BM": "futaies mixtes",
"BO": "oseraies"
}
try: return NATURE_CULTURE[x]
except: return x
def geo(df_net):
def format_code_departement(x):
x = str(x)
if len(x) == 1:
return f"0{x}"
else:
return x
df_net["Code departement"] = df_net["Code departement"].map(format_code_departement)
regions = pd.read_csv("region_2022.csv")
regions = regions[["REG", "LIBELLE"]]
regions.rename(columns={"REG": "Code region", "LIBELLE": "Nom region"}, inplace=True)
departements = pd.read_csv("departement_2022.csv")
departements = departements[["DEP", "REG", "LIBELLE"]]
departements.rename(columns={"DEP": "Code departement", "REG": "Code region", "LIBELLE": "Nom departement"}, inplace=True)
dico_code_region = {}
dico_nom_dept = {}
for index, row in departements.iterrows():
dico_code_region[row["Code departement"]] = row["Code region"]
dico_nom_dept[row["Code departement"]] = row["Nom departement"]
dico_nom_reg = {}
for index, row in regions.iterrows():
dico_nom_reg[row["Code region"]] = row["Nom region"]
dico_nom_zone = {"01": "Montagne & Massif",
"02": "Plaine",
"03": "Montagne & Massif",
"04": "Montagne & Massif",
"05": "Montagne & Massif",
"06": "Littoral",
"07": "Montagne & Massif",
"08": "Plaine",
"09": "Montagne & Massif",
"10": "Plaine",
"11": "Littoral",
"12": "Montagne & Massif",
"13": "Littoral",
"14": "Littoral",
"15": "Montagne & Massif",
"16": "Plaine",
"17": "Littoral",
"18": "Plaine",
"19": "Montagne & Massif",
"21": "Montagne & Massif",
"22": "Littoral",
"23": "Montagne & Massif",
"24": "Plaine",
"25": "Montagne & Massif",
"26": "Montagne & Massif",
"27": "Littoral",
"28": "Plaine",
"29": "Littoral",
"2A": "Île",
"2B": "Île",
"30": "Littoral",
"31": "Montagne & Massif",
"32": "Plaine",
"33": "Littoral",
"34": "Littoral",
"35": "Littoral",
"36": "Plaine",
"37": "Plaine",
"38": "Montagne & Massif",
"39": "Montagne & Massif",
"40": "Littoral",
"41": "Loir-et-Cher",
"42": "Montagne & Massif",
"43": "Montagne & Massif",
"44": "Littoral",
"45": "Plaine",
"46": "Montagne & Massif",
"47": "Lot-et-Garonne",
"48": "Montagne & Massif",
"49": "Plaine",
"50": "Littoral",
"51": "Plaine",
"52": "Plaine",
"53": "Plaine",
"54": "Montagne & Massif",
"55": "Plaine",
"56": "Littoral",
"57": "Montagne & Massif",
"58": "Montagne & Massif",
"59": "Littoral",
"60": "Plaine",
"61": "Plaine",
"62": "Littoral",
"63": "Montagne & Massif",
"64": "Littoral",
"65": "Montagne & Massif",
"66": "Littoral",
"67": "Montagne & Massif",
"68": "Montagne & Massif",
"69": "Montagne & Massif",
"70": "Montagne & Massif",
"71": "Montagne & Massif",
"72": "Plaine",
"73": "Montagne & Massif",
"74": "Montagne & Massif",
"75": "Plaine",
"76": "Littoral",
"77": "Plaine",
"78": "Plaine",
"79": "Plaine",
"80": "Littoral",
"81": "Montagne & Massif",
"82": "Plaine",
"83": "Littoral",
"84": "Montagne & Massif",
"85": "Littoral",
"86": "Plaine",
"87": "Montagne & Massif",
"88": "Montagne & Massif",
"89": "Montagne & Massif",
"90": "Montagne & Massif",
"91": "Plaine",
"92": "Plaine",
"93": "Plaine",
"94": "Plaine",
"95": "Plaine",
"971": "Île",
"972": "Île",
"973": "Hors France métropolitaine",
"974": "Île",
"976": "Île"}
df_net["Nom departement"] = df_net["Code departement"].map(lambda x: dico_nom_dept[x])
df_net["Code region"] = df_net["Code departement"].map(lambda x: dico_code_region[x])
df_net["Nom region"] = df_net["Code region"].map(lambda x: dico_nom_reg[x])
df_net["Nom zone"] = df_net["Code departement"].map(lambda x: dico_nom_zone[x])
df_net["Date mutation"] = pd.to_datetime(df_net["Date mutation"])
return df_net
df_geo = geo(df_net)
df_geo["Mois"] = df_geo["Date mutation"].dt.month
df_geo["Mois"] = df_geo["Mois"].map(lambda x: NOM_MOIS[x])
df_geo["Annee"] = df_geo["Date mutation"].dt.year
query = df_geo[(df_geo["Nom region"]=="Île-de-France")]
query = query[(query["Nature mutation"]=="Vente") & (query["Type local"]=="Maison") & (query["Annee"]==2021)]
query = query.groupby(["Nom departement", "Mois"])["Commune"].count()
query = query.reset_index()
query.rename(columns={"Commune": "Nb transactions"}, inplace=True)
query["filter"] = query["Mois"].map(lambda x: NUM_MOIS[x])
query.sort_values(by="filter", inplace=True)
query_2 = query.groupby("Mois")["Nb transactions"].mean().reset_index()
query_2.rename(columns={"Nb transactions": "moyenne"}, inplace=True)
query_2["filter"] = query_2["Mois"].map(lambda x: NUM_MOIS[x])
query_2 = query_2.sort_values(by="filter", inplace=False)
fig = px.bar(query, x="Mois", y="Nb transactions", color="Nom departement", barmode="group")
x = list(query_2["Mois"])
y = list(query_2["moyenne"])
fig.add_trace(go.Scatter(x=x, y=y, line=dict(color="firebrick"), name="moyenne"))
fig.update_layout(title="Bar chart sur la distribution du nombre de maisons vendus par mois et par département en 2021",
xaxis_title="Mois",
yaxis_title="Nombre de transactions",
title_x=0.5)
fig.show()
query = df_geo[(df_geo["Nom region"]=="Provence-Alpes-Côte d'Azur")]
query = query[(query["Nature mutation"]=="Vente") & (query["Type local"]=="Appartement") & (query["Annee"]==2019)]
query = query.groupby(["Nom departement", "Mois"])["Commune"].count()
query = query.reset_index()
query.rename(columns={"Commune": "Nb transactions"}, inplace=True)
query["filter"] = query["Mois"].map(lambda x: NUM_MOIS[x])
query.sort_values(by="filter", inplace=True)
query_2 = query.groupby("Mois")["Nb transactions"].mean().reset_index()
query_2.rename(columns={"Nb transactions": "moyenne"}, inplace=True)
query_2["filter"] = query_2["Mois"].map(lambda x: NUM_MOIS[x])
query_2 = query_2.sort_values(by="filter", inplace=False)
fig = px.bar(query, x="Mois", y="Nb transactions", color="Nom departement", barmode="group")
x = list(query_2["Mois"])
y = list(query_2["moyenne"])
fig.add_trace(go.Scatter(x=x, y=y, line=dict(color="firebrick"), name="moyenne"))
fig.update_layout(title="Bar chart sur la distribution du nombre d'appartements vendus par mois et par département en 2021 dans la région PACA",
xaxis_title="Mois",
yaxis_title="Nombre de transactions",
title_x=0.5)
fig.show()
query = df_geo[(df_geo["Type local"]=="Maison") & (df_geo["Annee"]==2021) & (~df_geo["Nom region"].isin(["Guadeloupe", "Martinique", "Guyane", "La Réunion"]))]
query = query.groupby("Nom departement")["Surface reelle bati"].mean().reset_index()
query.rename(columns={"Surface reelle bati": "Surface moyenne"}, inplace=True)
with open("departements.geojson") as f:
departements_geojson=json.load(f)
fig = px.choropleth_mapbox(query, departements_geojson, color="Surface moyenne", locations="Nom departement", featureidkey="properties.nom",
center={"lat": 46.2321, "lon": 2.2096},
mapbox_style="carto-positron", zoom=4.5, opacity=0.7,
hover_name="Nom departement",
color_continuous_scale=[(0, "green"), (0.5, "orange"), (1, "red")])
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()
query = df_geo[(df_geo["Type local"]=="Maison") & (df_geo["Annee"]==2021) & (~df_geo["Nom region"].isin(["Guadeloupe", "Martinique", "Guyane", "La Réunion"]))]
query = query.groupby("Nom departement")["Surface terrain"].mean().reset_index()
query.rename(columns={"Surface terrain": "Surface moyenne"}, inplace=True)
fig = px.choropleth_mapbox(query, departements_geojson, color="Surface moyenne", locations="Nom departement", featureidkey="properties.nom",
center={"lat": 46.2321, "lon": 2.2096},
mapbox_style="carto-positron", zoom=4.5, opacity=0.7,
hover_name="Nom departement",
color_continuous_scale=[(0, "green"), (0.5, "orange"), (1, "red")])
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()
query = df_geo[(df_geo["Type local"]=="Appartement") & (df_geo["Nombre pieces principales"]==3) & (df_geo["Annee"]==2021) & (~df_geo["Nom region"].isin(["Guadeloupe", "Martinique", "Guyane", "La Réunion"]))]
query = query.groupby("Nom departement")["Surface reelle bati"].mean().reset_index()
query.rename(columns={"Surface reelle bati": "Surface moyenne"}, inplace=True)
fig = px.choropleth_mapbox(query, departements_geojson, color="Surface moyenne", locations="Nom departement", featureidkey="properties.nom",
center={"lat": 46.2321, "lon": 2.2096},
mapbox_style="carto-positron", zoom=4.5, opacity=0.7,
hover_name="Nom departement",
color_continuous_scale=[(0, "green"), (0.5, "orange"), (1, "red")])
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()
df_geo["Nom zone"] = df_geo["Nom zone"].map(lambda x: "Littoral" if x=="Littotal" else x)
query = df_geo[(df_geo["Type local"]=="Maison") & (df_geo["Nom zone"]=="Littoral") & (df_geo["Annee"]==2021) & (~df_geo["Nom region"].isin(["Guadeloupe", "Martinique", "Guyane", "La Réunion"]))]
query = query.groupby("Nom departement")["Surface terrain"].mean().reset_index()
query.rename(columns={"Surface terrain": "Surface moyenne"}, inplace=True)
fig = px.choropleth_mapbox(query, departements_geojson, color="Surface moyenne", locations="Nom departement", featureidkey="properties.nom",
center={"lat": 46.2321, "lon": 2.2096},
mapbox_style="carto-positron", zoom=4.5, opacity=0.7,
hover_name="Nom departement",
color_continuous_scale=[(0, "green"), (0.5, "orange"), (1, "red")])
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()
query = df_geo[(df_geo["Type local"]=="Appartement") & (df_geo["Nom zone"]=="Montagne & Massif") & (df_geo["Annee"]==2021) & (~df_geo["Nom region"].isin(["Guadeloupe", "Martinique", "Guyane", "La Réunion"]))]
query = query.groupby("Nom departement")["Surface terrain"].mean().reset_index()
query.rename(columns={"Surface terrain": "Surface moyenne"}, inplace=True)
fig = px.choropleth_mapbox(query, departements_geojson, color="Surface moyenne", locations="Nom departement", featureidkey="properties.nom",
center={"lat": 46.2321, "lon": 2.2096},
mapbox_style="carto-positron", zoom=4.5, opacity=0.7,
hover_name="Nom departement",
color_continuous_scale=[(0, "green"), (0.5, "orange"), (1, "red")])
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()
gamme:
annee = 2021
gamme = "studio"
query = df_geo[(df_geo["Type local"]=="Appartement") & (df_geo["Annee"]==annee) & (df_geo["Surface terrain"].isna())]
query["Prix_m²"] = query["Valeur fonciere"] / query["Surface reelle bati"]
if gamme == "studio":
query = query[query["Surface reelle bati"] < 32]
elif gamme == "prestige":
query = query[query["Surface reelle bati"] > 200]
elif gamme == "standard":
query = query[query["Surface reelle bati"].between(32, 95)]
else:
query = query[query["Surface reelle bati"].between(95, 200)]
query = query.groupby("Nom region")["Prix_m²"].mean().reset_index()
query = query[~query["Nom region"].isin(["Guadeloupe", "Guyane", "La Réunion", "Martinique"])]
query.sort_values(by="Prix_m²", inplace=True, ascending=False)
fig = px.bar(query, x="Prix_m²", y="Nom region", color="Nom region")
fig.update_layout(title=f"Classement des régions selon le prix du m² lors de l'année {annee}",
xaxis_title="Prix moyen par transaction",
yaxis_title="Département",
title_x=0.5,
showlegend=False)
fig.show()
annee=2021
n=10
query = df_geo[(df_geo["Annee"]==annee) & (df_geo["Nature mutation"]=="Vente")]
query = query.groupby(["Nom departement"])["Valeur fonciere"].agg(["sum", "count"]).reset_index()
query["Prix moyen par transaction"] = query["sum"] / query["count"]
query.sort_values(by="Prix moyen par transaction", inplace=True, ascending=False)
query = query.head(n)
fig = px.bar(query, x="Prix moyen par transaction", y="Nom departement", color="Nom departement")
fig.update_layout(title=f"Bar chart sur le top {n} des départements les plus chers de l'année {annee}",
xaxis_title="Prix moyen par transaction",
yaxis_title="Département",
title_x=0.5,
showlegend=False)
fig.show()
def calcul_taux_evolution(valeur_depart, valeur_arrivee):
return (valeur_arrivee - valeur_depart) / valeur_depart * 100
annee_arr = 2021
annee_dep = 2020
query_arr = df_geo[(df_geo["Type local"]=="Appartement") & (df_geo["Annee"]==annee_arr) & (~df_geo["Nom region"].isin(["Guadeloupe", "Martinique", "Guyane", "La Réunion"]))]
query_arr = query_arr.groupby("Nom departement")["Valeur fonciere"].mean().reset_index()
query_dep = df_geo[(df_geo["Type local"]=="Appartement") & (df_geo["Annee"]==annee_dep) & (~df_geo["Nom region"].isin(["Guadeloupe", "Martinique", "Guyane", "La Réunion"]))]
query_dep = query_dep.groupby("Nom departement")["Valeur fonciere"].mean().reset_index()
query_dep.rename(columns={"Valeur fonciere": f"Valeur moyenne {annee_dep}"}, inplace=True)
query_arr.rename(columns={"Valeur fonciere": f"Valeur moyenne {annee_arr}", "Nom departement": "dep"}, inplace=True)
query = pd.concat([query_dep, query_arr], axis=1)
query["Taux d'évolution"] = query.apply(lambda x: calcul_taux_evolution(x[f"Valeur moyenne {annee_dep}"], x[f"Valeur moyenne {annee_arr}"]), axis=1)
query.drop(["dep", f"Valeur moyenne {annee_arr}", f"Valeur moyenne {annee_dep}"], axis=1, inplace=True)
fig = px.choropleth_mapbox(query, departements_geojson, color="Taux d'évolution", locations="Nom departement", featureidkey="properties.nom",
center={"lat": 46.2321, "lon": 2.2096},
mapbox_style="carto-positron", zoom=4.5, opacity=0.7,
hover_name="Nom departement",
color_continuous_scale=[(0, "red"), (0.5, "orange"), (1, "green")])
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()
query_arr = df_geo[(df_geo["Type local"]=="Appartement") & (df_geo["Annee"]==2020) & (~df_geo["Nom region"].isin(["Guadeloupe", "Martinique", "Guyane", "La Réunion"]))]
query_arr = query_arr.groupby("Nom departement")["Commune"].count().reset_index()
query_dep = df_geo[(df_geo["Type local"]=="Appartement") & (df_geo["Annee"]==2019) & (~df_geo["Nom region"].isin(["Guadeloupe", "Martinique", "Guyane", "La Réunion"]))]
query_dep = query_dep.groupby("Nom departement")["Commune"].count().reset_index()
query_arr.rename(columns={"Valeur fonciere": "Valeur moyenne 2020"}, inplace=True)
query = pd.merge(query_dep, query_arr, on="Nom departement")
query["Taux d'évolution"] = query.apply(lambda x: calcul_taux_evolution(x["Commune_x"], x["Commune_y"]), axis=1)
fig = px.choropleth_mapbox(query, departements_geojson, color="Taux d'évolution", locations="Nom departement", featureidkey="properties.nom",
center={"lat": 46.2321, "lon": 2.2096},
mapbox_style="carto-positron", zoom=4.5, opacity=0.7,
hover_name="Nom departement",
color_continuous_scale=[(0, "red"), (0.5, "orange"), (1, "green")])
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()
def map_chart_gamme_appartement_Paris(gamme, annee):
def extract_c_ar(x):
return int(x.split(" ")[1])
with open("arrondissements_paris.geojson") as f:
paris_geojson=json.load(f)
PARIS = df_geo[df_geo["Nom departement"]=="Paris"]
query = PARIS[(PARIS["Type local"]=="Appartement") & (PARIS["Annee"]==annee) & (PARIS["Surface terrain"].isna())]
query["Prix_m²"] = query["Valeur fonciere"] / query["Surface reelle bati"]
if gamme == "studio":
query = query[query["Surface reelle bati"] < 32]
elif gamme == "prestige":
query = query[query["Surface reelle bati"] > 200]
elif gamme == "standard":
query = query[query["Surface reelle bati"].between(32, 95)]
else:
query = query[query["Surface reelle bati"].between(95, 200)]
query = query.groupby("Commune")["Prix_m²"].mean().reset_index()
query["c_ar"] = query["Commune"].map(extract_c_ar)
fig = px.choropleth_mapbox(query, paris_geojson, color="Prix_m²", locations="c_ar", featureidkey="properties.c_ar",
center={"lat": 48.8566, "lon": 2.3522},
mapbox_style="carto-positron", zoom=11, opacity=0.7,
labels={"Prix_m²": f"Prix m² moyen"},
hover_name="Commune",
color_continuous_scale=[(0, "green"), (0.5, "orange"), (1, "red")])
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()
gamme = "studio"
annee = 2019
map_chart_gamme_appartement_Paris(gamme, annee)
def map_chart_gamme_appartement_Marseille(gamme, annee):
marseille_code_postal =[i for i in range(13001, 13017)]
MARSEILLE = df_geo[df_geo["Code postal"].isin(marseille_code_postal)]
with open("arrondissements_marseille.geojson") as f:
marseille_geojson=json.load(f)
def extract_code_postal_marseille(x):
x = x.split(" ")[1]
dico = {
"10EME": "13210",
"11EME": "13211",
"12EME": "13212",
"13EME": "13213",
"14EME": "13214",
"15EME": "13215",
"16EME": "13216",
"1ER": "13201",
"2EME": "13202",
"3EME": "13203",
"4EME": "13204",
"5EME": "13205",
"6EME": "13206",
"7EME": "13207",
"8EME": "13208",
"9EME": "13209"
}
return dico[x]
query = MARSEILLE[(MARSEILLE["Type local"]=="Appartement") & (MARSEILLE["Annee"]==annee) & (MARSEILLE["Surface terrain"].isna())]
query["Prix_m²"] = query["Valeur fonciere"] / query["Surface reelle bati"]
#query["code_postal"] = query["Code postal"].map(lambda x: str(int(x)+200))
if gamme == "studio":
query = query[query["Surface reelle bati"] < 32]
elif gamme == "prestige":
query = query[query["Surface reelle bati"] > 200]
elif gamme == "standard":
query = query[query["Surface reelle bati"].between(32, 95)]
else:
query = query[query["Surface reelle bati"].between(95, 200)]
query = query.groupby("Commune")["Prix_m²"].mean().reset_index()
query["code_postal"] = query["Commune"].map(extract_code_postal_marseille)
fig = px.choropleth_mapbox(query, marseille_geojson, color="Prix_m²", locations="code_postal", featureidkey="properties.code_postal",
center={"lat": 43.2964, "lon": 5.3697},
mapbox_style="carto-positron", zoom=10, opacity=0.7,
labels={"Prix_m²": f"Prix m² moyen"},
hover_name="Commune",
color_continuous_scale=[(0, "green"), (0.5, "orange"), (1, "red")])
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()
gamme = "studio"
annee = 2019
map_chart_gamme_appartement_Marseille(gamme, annee)
def map_chart_gamme_appartement_Lyon(gamme, annee):
lyon_code_postal = [i for i in range(69001, 69010)]
LYON = df_geo[df_geo["Code postal"].isin(lyon_code_postal)]
with open("arrondissements_lyon.geojson") as f:
lyon_geojson=json.load(f)
query = LYON[(LYON["Type local"]=="Appartement") & (LYON["Annee"]==annee) & (LYON["Surface terrain"].isna())]
query["Prix_m²"] = query["Valeur fonciere"] / query["Surface reelle bati"]
if gamme == "studio":
query = query[query["Surface reelle bati"] < 32]
elif gamme == "prestige":
query = query[query["Surface reelle bati"] > 200]
elif gamme == "standard":
query = query[query["Surface reelle bati"].between(32, 95)]
else:
query = query[query["Surface reelle bati"].between(95, 200)]
query = query.groupby("Commune")["Prix_m²"].mean().reset_index()
fig = px.choropleth_mapbox(query, lyon_geojson, color="Prix_m²", locations="Commune", featureidkey="properties.nom",
center={"lat": 45.7603, "lon": 4.8496},
mapbox_style="carto-positron", zoom=11, opacity=0.7,
labels={"Prix_m²": f"Prix m² moyen"},
hover_name="Commune",
color_continuous_scale=[(0, "green"), (0.5, "orange"), (1, "red")])
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()
gamme = "studio"
annee = 2019
map_chart_gamme_appartement_Lyon(gamme, annee)
df_geo.head()
| No disposition | Date mutation | Nature mutation | Valeur fonciere | No voie | B/T/Q | Type de voie | Code voie | Voie | Code postal | Commune | Code departement | Code commune | Prefixe de section | Section | No plan | No Volume | 1er lot | Surface Carrez du 1er lot | 2eme lot | Surface Carrez du 2eme lot | 3eme lot | Surface Carrez du 3eme lot | 4eme lot | Surface Carrez du 4eme lot | 5eme lot | Surface Carrez du 5eme lot | Nombre de lots | Code type local | Type local | Surface reelle bati | Nombre pieces principales | Nature culture | Nature culture speciale | Surface terrain | Nom departement | Code region | Nom region | Nom zone | Mois | Annee | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.000 | 2019-04-01 | Vente | 37220.000 | 26.000 | NaN | RUE | 2730 | DE MONTHOLON | 1000.000 | BOURG-EN-BRESSE | 01 | 53.000 | NaN | AI | 298.000 | NaN | 8.000 | 19.270 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.000 | 2.000 | Appartement | 20.000 | 1.000 | NaN | NaN | NaN | Ain | 84 | Auvergne-Rhône-Alpes | Montagne & Massif | Avril | 2019 |
| 1 | 1.000 | 2019-04-01 | Vente | 185100.000 | 22.000 | B | RUE | 1650 | GEN DELESTRAINT | 1000.000 | BOURG-EN-BRESSE | 01 | 53.000 | NaN | AM | 95.000 | NaN | 7.000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.000 | 3.000 | Dépendance | 0.000 | 0.000 | NaN | NaN | NaN | Ain | 84 | Auvergne-Rhône-Alpes | Montagne & Massif | Avril | 2019 |
| 2 | 1.000 | 2019-04-01 | Vente | 185100.000 | 22.000 | NaN | RUE | 1650 | GEN DELESTRAINT | 1000.000 | BOURG-EN-BRESSE | 01 | 53.000 | NaN | AM | 95.000 | NaN | 137.000 | NaN | 154.000 | 61.510 | NaN | NaN | NaN | NaN | NaN | NaN | 2.000 | 2.000 | Appartement | 62.000 | 3.000 | NaN | NaN | NaN | Ain | 84 | Auvergne-Rhône-Alpes | Montagne & Massif | Avril | 2019 |
| 3 | 1.000 | 2019-08-01 | Vente | 209000.000 | 3.000 | NaN | RUE | 0043 | DES CHAMPAGNES | 1160.000 | PRIAY | 01 | 314.000 | NaN | E | 1676.000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.000 | 1.000 | Maison | 90.000 | 4.000 | S | NaN | 940.000 | Ain | 84 | Auvergne-Rhône-Alpes | Montagne & Massif | Août | 2019 |
| 4 | 1.000 | 2019-07-01 | Vente | 134900.000 | 5.000 | NaN | LOT | A003 | LE BIOLAY | 1370.000 | SAINT-ETIENNE-DU-BOIS | 01 | 350.000 | NaN | AA | 11.000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.000 | 1.000 | Maison | 101.000 | 5.000 | S | NaN | 490.000 | Ain | 84 | Auvergne-Rhône-Alpes | Montagne & Massif | Juillet | 2019 |
query = df_geo[(df_geo["Type local"]=="Appartement") & (df_geo["Annee"]==2021)]
fig = px.scatter(data_frame=query, x="Nombre pieces principales", y="Surface reelle bati")
fig.show()
query = df_geo[(df_geo["Nature mutation"]=="Vente")]
query = query.groupby("Date mutation")["Commune"].count().reset_index()
query.rename(columns={"Commune": "Nb transactions", "Date mutation": "Date"}, inplace=True)
fig = px.line(data_frame=query, x="Date", y="Nb transactions")
fig.update_xaxes(rangeslider_visible=True,
rangeselector=dict(
buttons=list([
dict(count=1, label="1m", step="month", stepmode="backward"),
dict(count=6, label="6m", step="month", stepmode="backward"),
dict(count=1, label="YTD", step="year", stepmode="todate"),
dict(count=1, label="1y", step="year", stepmode="backward"),
dict(step="all")
])))
fig.update_layout(title="Plot chart sur le nombre de transactions par jour",
xaxis_title="Date",
yaxis_title="Nombre de transactions",
title_x=0.5,
title_y=1)
fig.show()
query_final = pd.DataFrame(df_geo["Mois"].unique(),columns=["Mois"])
for annee in df_geo["Annee"].unique():
query = df_geo[(df_geo["Nature mutation"]=="Vente") & (df_geo["Annee"]==annee)]
query = query.groupby(["Mois"])["Commune"].count().reset_index()
query.rename(columns={"Commune": str(annee)}, inplace=True)
query_final = pd.merge(query_final, query, on="Mois")
query_final["Mois"] = query_final["Mois"].map(lambda x: NUM_MOIS[x])
query_final.sort_values(by="Mois", inplace=True)
query_final["Mois"] = query_final["Mois"].map(lambda x: NOM_MOIS[x])
fig = px.line(query_final, x="Mois", y=query_final.columns)
fig.update_layout(title="Plot chart sur le nombre de transactions par mois",
xaxis_title="Mois",
yaxis_title="Nombre de transactions",
title_x=0.5)
fig.show()
query_final = pd.DataFrame(df_geo["Mois"].unique(),columns=["Mois"])
for annee in df_geo["Annee"].unique():
query = df_geo[(df_geo["Nature mutation"]=="Vente") & (df_geo["Annee"]==annee)]
query = query.groupby(["Mois"])["Commune"].count().reset_index()
query["Mois"] = query["Mois"].map(lambda x: NUM_MOIS[x])
query.sort_values(by="Mois", inplace=True)
query["Mois"] = query["Mois"].map(lambda x: NOM_MOIS[x])
query["Commune"] = query["Commune"].cumsum()
query.rename(columns={"Commune": str(annee)}, inplace=True)
query_final = pd.merge(query_final, query, on="Mois")
query_final["Mois"] = query_final["Mois"].map(lambda x: NUM_MOIS[x])
query_final.sort_values(by="Mois", inplace=True)
query_final["Mois"] = query_final["Mois"].map(lambda x: NOM_MOIS[x])
fig = px.line(query_final, x="Mois", y=query_final.columns)
fig.update_layout(title="Plot chart sur le YTD du nombre de transactions",
xaxis_title="Mois",
yaxis_title="Nombre de transactions",
title_x=0.5)
fig.show()
def titre_transaction(nature, type_local, pieces):
return str(nature) + " " + str(type_local) + " avec " + str(pieces) + "pièces"
n=20
annee=2021
query = df_geo[(df_geo["Type local"].isin(["Maison", "Appartement"])) & (df_geo["Annee"]==annee)]
query = query.groupby(["Nature mutation", "Type local", "Nombre pieces principales", "Nature culture"])["Commune"].count().reset_index()
query.sort_values(by="Commune", inplace=True, ascending=False)
query.rename(columns={"Commune": "Nb transactions"}, inplace=True)
query["Transaction"] = query.apply(lambda x: titre_transaction(x["Nature mutation"], x["Type local"], x["Nombre pieces principales"]), axis=1)
query = query[["Transaction", "Nb transactions"]].head(n)
fig = px.bar(query, x="Nb transactions", y="Transaction", color="Transaction")
fig.update_layout(title=f"Bar chart sur la fréquence d'occurence du top {n} de transactions lors de l'année {annee}",
xaxis_title="Nb transactions",
yaxis_title="Transactions",
title_x=0.5,
showlegend=False)
fig.show()
n=20
annee=2019
query = df_geo[(df_geo["Type local"].isin(["Maison", "Appartement"])) & (df_geo["Annee"]==annee)]
query = query.groupby(["Nature mutation", "Type local", "Nombre pieces principales", "Nature culture"])["Commune"].count().reset_index()
query.sort_values(by="Commune", inplace=True, ascending=False)
query.rename(columns={"Commune": "Nb transactions"}, inplace=True)
query["Transaction"] = query.apply(lambda x: titre_transaction(x["Nature mutation"], x["Type local"], x["Nombre pieces principales"]), axis=1)
query = query[["Transaction", "Nb transactions"]].head(n)
fig = px.bar(query, x="Nb transactions", y="Transaction", color="Transaction")
fig.update_layout(title=f"Bar chart sur la fréquence d'occurence du top {n} de transactions lors de l'année {annee}",
xaxis_title="Nb transactions",
yaxis_title="Transactions",
title_x=0.5,
showlegend=False)
fig.show()